Proyecto Final. 🚀

Jorge Morales, Tomás Sandí y Alejandro Zamora.

5/4/2022

knitr::opts_chunk$set(echo = TRUE)

# Librerias
library(dplyr)
library(tidyverse)
library(PASWR)
library(ggplot2)
library(TSstudio)
library(corrplot)
library(DAAG)
library(mvnormtest)
library(normtest)
library(MVN)
library(TSstudio)

Importacion de datos

datos_originales <- readRDS("chicago.rds")

Descripcion de los datos

summary(datos_originales)
##      city                tmpd             dptp             date           
##  Length:6940        Min.   :-16.00   Min.   :-25.62   Min.   :1987-01-01  
##  Class :character   1st Qu.: 35.00   1st Qu.: 27.00   1st Qu.:1991-10-01  
##  Mode  :character   Median : 51.00   Median : 39.88   Median :1996-07-01  
##                     Mean   : 50.31   Mean   : 40.34   Mean   :1996-07-01  
##                     3rd Qu.: 67.00   3rd Qu.: 55.75   3rd Qu.:2001-04-01  
##                     Max.   : 92.00   Max.   : 78.25   Max.   :2005-12-31  
##                     NA's   :1        NA's   :2                            
##    pm25tmean2      pm10tmean2        o3tmean2         no2tmean2     
##  Min.   : 1.70   Min.   :  2.00   Min.   : 0.1528   Min.   : 6.158  
##  1st Qu.: 9.70   1st Qu.: 21.50   1st Qu.:10.0729   1st Qu.:19.654  
##  Median :14.66   Median : 30.28   Median :18.5218   Median :24.556  
##  Mean   :16.23   Mean   : 33.90   Mean   :19.4355   Mean   :25.232  
##  3rd Qu.:20.60   3rd Qu.: 42.00   3rd Qu.:27.0010   3rd Qu.:30.139  
##  Max.   :61.50   Max.   :365.00   Max.   :66.5875   Max.   :62.480  
##  NA's   :4447    NA's   :242
glimpse(datos_originales)
## Rows: 6,940
## Columns: 8
## $ city       <chr> "chic", "chic", "chic", "chic", "chic", "chic", "chic", "ch~
## $ tmpd       <dbl> 31.5, 33.0, 33.0, 29.0, 32.0, 40.0, 34.5, 29.0, 26.5, 32.5,~
## $ dptp       <dbl> 31.500, 29.875, 27.375, 28.625, 28.875, 35.125, 26.750, 22.~
## $ date       <date> 1987-01-01, 1987-01-02, 1987-01-03, 1987-01-04, 1987-01-05~
## $ pm25tmean2 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ pm10tmean2 <dbl> 34.00000, NA, 34.16667, 47.00000, NA, 48.00000, 41.00000, 3~
## $ o3tmean2   <dbl> 4.250000, 3.304348, 3.333333, 4.375000, 4.750000, 5.833333,~
## $ no2tmean2  <dbl> 19.98810, 23.19099, 23.81548, 30.43452, 30.33333, 25.77233,~
unique(datos_originales$city)
## [1] "chic"
  • city: Ciudad de Chicago (no aporta valor: borrar)
  • tmpd: temperatura en grados fahrenheit (desde los -16 a 92 F)
  • dptp: total de muertes por dia
  • date: Dias (desde el 1987-01-01 al 2005-12-31)
  • pm25tmean2: cantidad promedio de particulas menores a los 2.5 microgramos por metro cubico (mas preligrosas, presenta muchos datos faltantes)
  • pm10tmean2: cantidad promedio de particulas entre 2.5 y 10 microgramos por metro cubico (presenta algunos datos faltantes)
  • o3tmean2: Promedio de Ozono medido en partes por millon
  • no2tmean2: Dioxido de Nitrogeno promedio en partes por millon

Analisis exploratorio

Datos faltantes

# Grafico pm25tmean2 vs tiempo: NA = -50
grafico1 <- datos_originales %>% 
  mutate(na_cero_pm25 = ifelse(is.na(pm25tmean2), -50, pm25tmean2)) %>% 
  dplyr::select(date, na_cero_pm25) 

ts_plot(grafico1)
cant_filas = 6940

cant_na_pm25tmean2 = 4447

cant_na_pm25tmean2/cant_filas #% NAs pm25tmean2
## [1] 0.6407781
cant_na_pm10tmean2 = 242

cant_na_pm10tmean2/cant_filas # % NAs pm10tmean2
## [1] 0.03487032
  • pm25tmean2
    • No hay datos de pm25tmean2 desde 1987 hasta 1998
    • Varios de los datos a partir de 1998 tienen datos faltantes pm25tmean2
    • 64% de la variable pm25tmean2 es faltante (recomendable eliminar la variable)
  • pm10tmean2
    • 3% de la variable pm10tmean2 es faltante (se puede filtrar)

Distribucion de variables

# tmpd
EDA(sample(datos_originales$tmpd      , size = 5000))
## [1] "sample(datos_originales$tmpd, size = 5000)"

## Size (n)  Missing  Minimum   1st Qu     Mean   Median   TrMean   3rd Qu 
## 5000.000    0.000  -16.000   35.500   50.482   51.000   50.934   67.000 
##     Max.   Stdev.     Var.  SE Mean   I.Q.R.    Range Kurtosis Skewness 
##   92.000   19.360  374.793    0.274   31.500  108.000   -0.773   -0.244 
## SW p-val 
##    0.000
EDA(sample((datos_originales$tmpd)^1.2      , size = 5000))
## [1] "sample((datos_originales$tmpd)^1.2, size = 5000)"

## Size (n)  Missing  Minimum   1st Qu     Mean   Median   TrMean   3rd Qu 
## 4985.000   15.000    0.000   72.489  112.669  111.965  113.166  155.342 
##     Max.   Stdev.     Var.  SE Mean   I.Q.R.    Range Kurtosis Skewness 
##  227.272   49.494 2449.648    0.701   82.853  227.272   -0.992   -0.072 
## SW p-val 
##    0.000
data <- datos_originales
data_numeric <- dplyr::select(data, -date, -city)
library(psych)
multi.hist(x = data_numeric, dcol = c("blue", "red"), dlty = c("dotted", "solid"),
           main = "")

  • tmpd
    • NO se presenta normalidad (según prueba de hipótesis al 95% de confianza), principalmente en las colas
    • Sesgo en cola IZQUIERDA (se podria transformar elevando a 1.2)
    • Sin valores extremos
    • Aparentemente BIMODAL
# dptp
EDA(sample(datos_originales$dptp      , size = 5000))
## [1] "sample(datos_originales$dptp, size = 5000)"

## Size (n)  Missing  Minimum   1st Qu     Mean   Median   TrMean   3rd Qu 
## 4998.000    2.000  -25.625   27.200   40.250   39.500   40.720   55.425 
##     Max.   Stdev.     Var.  SE Mean   I.Q.R.    Range Kurtosis Skewness 
##   78.250   18.396  338.398    0.260   28.225  103.875   -0.539   -0.242 
## SW p-val 
##    0.000
EDA(sample((datos_originales$dptp)^1.2      , size = 5000))
## [1] "sample((datos_originales$dptp)^1.2, size = 5000)"

## Size (n)  Missing  Minimum   1st Qu     Mean   Median   TrMean   3rd Qu 
## 4914.000   86.000    0.000   53.125   88.270   84.593   88.202  124.928 
##     Max.   Stdev.     Var.  SE Mean   I.Q.R.    Range Kurtosis Skewness 
##  187.146   43.739 1913.114    0.624   71.803  187.146   -1.017    0.085 
## SW p-val 
##    0.000
  • dptp
    • NO se presenta normalidad, principalmente en las colas
    • Sesgo en cola IZQUIERDA (se podria transformar elevando a 1.2)
    • Presenta valores extremos en cola IZQUIERDA
    • Aparentemente BIMODAL
# pm25tmean2
EDA(sample(datos_originales$pm25tmean2, size = 5000))
## [1] "sample(datos_originales$pm25tmean2, size = 5000)"

## Size (n)  Missing  Minimum   1st Qu     Mean   Median   TrMean   3rd Qu 
## 1791.000 3209.000    1.700    9.775   16.197   14.762   15.543   20.529 
##     Max.   Stdev.     Var.  SE Mean   I.Q.R.    Range Kurtosis Skewness 
##   61.500    8.717   75.979    0.206   10.754   59.800    1.569    1.140 
## SW p-val 
##    0.000
max_1 <- max(datos_originales$pm25tmean2, na.rm = TRUE)
max_2 <- max(filter(datos_originales, pm25tmean2 != max_1)$pm25tmean2, na.rm = TRUE)

EDA(sample(sqrt(filter(datos_originales, !(pm25tmean2 %in% c(max_1, max_2)))$pm25tmean2), size = 5000))
## [1] "sample(sqrt(filter(datos_originales, !(pm25tmean2 %in% c(max_1, "
## [2] "    max_2)))$pm25tmean2), size = 5000)"

## Size (n)  Missing  Minimum   1st Qu     Mean   Median   TrMean   3rd Qu 
## 1811.000 3189.000    1.304    3.130    3.899    3.834    3.866    4.561 
##     Max.   Stdev.     Var.  SE Mean   I.Q.R.    Range Kurtosis Skewness 
##    7.211    1.036    1.073    0.024    1.431    5.907   -0.191    0.429 
## SW p-val 
##    0.000
  • pm25tmean2
    • NO se presenta normalidad
    • Sesgo en cola DERECHA (se podria centrar con RAIZ CUADRADA)
    • Presenta valores extremos en cola DERECHA (recomendable FILTRARLOS)
    • UNIMODAL
# pm10tmean2
EDA(sample(datos_originales$pm10tmean2, size = 5000))
## [1] "sample(datos_originales$pm10tmean2, size = 5000)"

## Size (n)  Missing  Minimum   1st Qu     Mean   Median   TrMean   3rd Qu 
## 4827.000  173.000    2.000   21.500   33.720   30.000   32.310   42.000 
##     Max.   Stdev.     Var.  SE Mean   I.Q.R.    Range Kurtosis Skewness 
##  365.000   18.169  330.127    0.262   20.500  363.000   26.265    2.581 
## SW p-val 
##    0.000
max_1 <- max(datos_originales$pm10tmean2, na.rm = TRUE)
max_2 <- max(filter(datos_originales, pm10tmean2 != max_1)$pm10tmean2, na.rm = TRUE)

EDA(sample(log(filter(datos_originales, !(pm10tmean2 %in% c(max_1, max_2)))$pm10tmean2), size = 5000))
## [1] "sample(log(filter(datos_originales, !(pm10tmean2 %in% c(max_1, "
## [2] "    max_2)))$pm10tmean2), size = 5000)"

## Size (n)  Missing  Minimum   1st Qu     Mean   Median   TrMean   3rd Qu 
## 4835.000  165.000    0.693    3.068    3.384    3.401    3.395    3.738 
##     Max.   Stdev.     Var.  SE Mean   I.Q.R.    Range Kurtosis Skewness 
##    4.949    0.521    0.272    0.007    0.670    4.256    0.424   -0.345 
## SW p-val 
##    0.000
  • pm10tmean2
    • NO se presenta normalidad
    • Sesgo en cola DERECHA (se podria centrar con LOGARITMO)
    • Presenta valores extremos en cola DERECHA (recomendable FILTRARLOS)
    • UNIMODAL
# o3tmean2
EDA(sample(datos_originales$o3tmean2  , size = 5000))
## [1] "sample(datos_originales$o3tmean2, size = 5000)"

## Size (n)  Missing  Minimum   1st Qu     Mean   Median   TrMean   3rd Qu 
## 5000.000    0.000    0.153   10.199   19.453   18.509   18.909   26.995 
##     Max.   Stdev.     Var.  SE Mean   I.Q.R.    Range Kurtosis Skewness 
##   66.588   11.360  129.053    0.161   16.796   66.435    0.032    0.602 
## SW p-val 
##    0.000
EDA(sample(sqrt(datos_originales$o3tmean2)  , size = 5000))
## [1] "sample(sqrt(datos_originales$o3tmean2), size = 5000)"

## Size (n)  Missing  Minimum   1st Qu     Mean   Median   TrMean   3rd Qu 
## 5000.000    0.000    0.601    3.190    4.201    4.312    4.204    5.195 
##     Max.   Stdev.     Var.  SE Mean   I.Q.R.    Range Kurtosis Skewness 
##    8.160    1.352    1.829    0.019    2.005    7.559   -0.572   -0.097 
## SW p-val 
##    0.000
  • o3tmean2
    • NO se presenta normalidad
    • Sesgo en cola DERECHA (se podria centrar con RAIZ CUADRADA)
    • Presenta valores extremos en cola DERECHA
    • UNIMODAL
# no2tmean2
EDA(sample(datos_originales$no2tmean2, size = 5000))
## [1] "sample(datos_originales$no2tmean2, size = 5000)"

## Size (n)  Missing  Minimum   1st Qu     Mean   Median   TrMean   3rd Qu 
## 5000.000    0.000    6.726   19.621   25.168   24.516   24.890   30.001 
##     Max.   Stdev.     Var.  SE Mean   I.Q.R.    Range Kurtosis Skewness 
##   62.480    7.918   62.699    0.112   10.380   55.754    0.516    0.547 
## SW p-val 
##    0.000
EDA(sample(sqrt(datos_originales$no2tmean2), size = 5000))
## [1] "sample(sqrt(datos_originales$no2tmean2), size = 5000)"

## Size (n)  Missing  Minimum   1st Qu     Mean   Median   TrMean   3rd Qu 
## 5000.000    0.000    2.527    4.443    4.965    4.955    4.961    5.489 
##     Max.   Stdev.     Var.  SE Mean   I.Q.R.    Range Kurtosis Skewness 
##    7.904    0.800    0.640    0.011    1.046    5.377    0.039    0.069 
## SW p-val 
##    0.002
  • no2tmean2
    • NO se presenta normalidad
    • Sesgo en cola DERECHA (se podria centrar con RAIZ CUADRADA)
    • Presenta valores extremos en cola DERECHA
    • UNIMODAL

Correlacion

correlaciones <- cor(datos_originales %>% dplyr::select(-c(city)) %>% mutate(date = as.numeric(date)), use = "complete.obs")
  
corrplot(correlaciones, method = "number")

* Pareciera haber relacion entre las variables temperatura, muertes por dia y ozono * Las variables de particulas en el aire aparece correlacionadas entre si y con nitrogeno

Limpieza de datos (data cleaning)

ts_plot(dplyr::select(data,tmpd,date),
        title = "temperatura en grados fahrenheit (desde los -16 a 92 F)",
        Xtitle = "Time",
        Ytitle = "temperatura")
ts_plot(dplyr::select(data,pm10tmean2,date),
        title = "cantidad promedio de particulas menores a los 2.5 microgramos por metro cubico (mas preligrosas)",
        Xtitle = "particulas",
        Ytitle = "temperatura")
ts_plot(dplyr::select(data,pm10tmean2,tmpd,date),
        title = "cantidad promedio de particulas menores a los 2.5 microgramos por metro cubico vs temperatura en grados fahrenheit (desde los -16 a 92 F)",
        Xtitle = "particulas",
        Ytitle = "temperatura")

Prueba de normalidad multivariada.

result <- mvn(data = data_numeric, mvnTest = "mardia")
result$multivariateNormality
##              Test        Statistic p value Result
## 1 Mardia Skewness 3089.53870070102       0     NO
## 2 Mardia Kurtosis 18.4616360991651       0     NO
## 3             MVN             <NA>    <NA>     NO

Los datos no siguen una distribución normal multivariada.

library(GGally)
ggpairs(data, lower = list(continuous = "smooth"),
        diag = list(continuous = "barDiag"), axisLabels = "none")

library(psych)
multi.hist(x = data_numeric, dcol = c("blue", "red"), dlty = c("dotted", "solid"),
           main = "")

# Pruebas estadísticas t-test.

t.test(data$pm10tmean2,data$tmpd) # where y1 and y2 are numeric
## 
##  Welch Two Sample t-test
## 
## data:  data$pm10tmean2 and data$tmpd
## t = -51.267, df = 13611, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -17.04171 -15.78656
## sample estimates:
## mean of x mean of y 
##  33.89521  50.30934
t.test(data$pm10tmean2,mu=3)
## 
##  One Sample t-test
## 
## data:  data$pm10tmean2
## t = 140.73, df = 6697, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 3
## 95 percent confidence interval:
##  33.46484 34.32557
## sample estimates:
## mean of x 
##  33.89521

#—————————————————

Construcción del modelo, seleccionando los predictores de forma aleatoria.

# Multiple Linear Regression Example
fit <- lm(cbind(pm10tmean2, tmpd)  ~  dptp + date + pm25tmean2 +
               pm10tmean2 + pm10tmean2 + o3tmean2 + no2tmean2, data=data)
summary(fit) # show results
## Response pm10tmean2 :
## 
## Call:
## lm(formula = pm10tmean2 ~ dptp + date + pm25tmean2 + pm10tmean2 + 
##     pm10tmean2 + o3tmean2 + no2tmean2, data = data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.816e-14 -1.120e-15 -3.600e-16  3.700e-16  9.852e-13 
## 
## Coefficients:
##               Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)  1.226e-14  6.446e-15  1.902e+00   0.0573 .  
## dptp         2.849e-16  2.721e-17  1.047e+01   <2e-16 ***
## date         3.535e-20  5.100e-19  6.900e-02   0.9447    
## pm25tmean2   1.867e-15  6.224e-17  3.000e+01   <2e-16 ***
## pm10tmean2   1.000e+00  3.955e-17  2.529e+16   <2e-16 ***
## o3tmean2    -4.500e-17  4.453e-17 -1.011e+00   0.3123    
## no2tmean2    1.549e-17  7.267e-17  2.130e-01   0.8312    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.984e-14 on 2481 degrees of freedom
##   (4452 observations deleted due to missingness)
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 2.394e+32 on 6 and 2481 DF,  p-value: < 2.2e-16
## 
## 
## Response tmpd :
## 
## Call:
## lm(formula = tmpd ~ dptp + date + pm25tmean2 + pm10tmean2 + pm10tmean2 + 
##     o3tmean2 + no2tmean2, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.1566  -2.6391  -0.2384   2.5063  14.9304 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.203e+00  1.234e+00   4.218 2.56e-05 ***
## dptp         9.197e-01  5.207e-03 176.651  < 2e-16 ***
## date         1.169e-04  9.759e-05   1.197    0.231    
## pm25tmean2  -3.376e-01  1.191e-02 -28.345  < 2e-16 ***
## pm10tmean2   1.534e-01  7.568e-03  20.272  < 2e-16 ***
## o3tmean2     2.475e-01  8.522e-03  29.038  < 2e-16 ***
## no2tmean2    1.169e-01  1.391e-02   8.409  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.797 on 2481 degrees of freedom
##   (4452 observations deleted due to missingness)
## Multiple R-squared:  0.9607, Adjusted R-squared:  0.9606 
## F-statistic: 1.01e+04 on 6 and 2481 DF,  p-value: < 2.2e-16
coefficients(fit)
##                pm10tmean2          tmpd
## (Intercept)  1.226182e-14  5.2027509050
## dptp         2.849274e-16  0.9197470363
## date         3.535004e-20  0.0001168587
## pm25tmean2   1.867385e-15 -0.3376400888
## pm10tmean2   1.000000e+00  0.1534285870
## o3tmean2    -4.500241e-17  0.2474641875
## no2tmean2    1.548984e-17  0.1169446156
confint(fit, level=0.95)
##                                2.5 %        97.5 %
## pm10tmean2:(Intercept) -3.784924e-16  2.490213e-14
## pm10tmean2:dptp         2.315775e-16  3.382772e-16
## pm10tmean2:date        -9.646340e-19  1.035334e-18
## pm10tmean2:pm25tmean2   1.745330e-15  1.989440e-15
## pm10tmean2:pm10tmean2   1.000000e+00  1.000000e+00
## pm10tmean2:o3tmean2    -1.323246e-16  4.231976e-17
## pm10tmean2:no2tmean2   -1.270102e-16  1.579899e-16
## tmpd:(Intercept)        2.783743e+00  7.621759e+00
## tmpd:dptp               9.095373e-01  9.299567e-01
## tmpd:date              -7.451072e-05  3.082282e-04
## tmpd:pm25tmean2        -3.609980e-01 -3.142822e-01
## tmpd:pm10tmean2         1.385875e-01  1.682697e-01
## tmpd:o3tmean2           2.307531e-01  2.641753e-01
## tmpd:no2tmean2          8.967403e-02  1.442152e-01
new_data <- data[1:1000,]
# compare models
fit1 <- lm(cbind(pm10tmean2, tmpd)  ~  dptp + date +
               pm10tmean2 + pm10tmean2 + o3tmean2 + no2tmean2, data=new_data)
fit2 <- lm(cbind(pm10tmean2, tmpd)  ~  dptp + date +
               pm10tmean2 + no2tmean2, data=new_data)
anova(fit1, fit2)
## Analysis of Variance Table
## 
## Model 1: cbind(pm10tmean2, tmpd) ~ dptp + date + pm10tmean2 + pm10tmean2 + 
##     o3tmean2 + no2tmean2
## Model 2: cbind(pm10tmean2, tmpd) ~ dptp + date + pm10tmean2 + no2tmean2
##   Res.Df Df   Gen.var.  Pillai approx F num Df den Df    Pr(>F)    
## 1    820    3.3966e-14                                             
## 2    821  1 3.9757e-14 0.27187    152.9      2    819 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

El modelo con todas las variables introducidas como predictores tiene un R2 considerable (0.6181), es capaz de explicar el 61,81% de la variabilidad observada en cantidad promedio de particulas entre 2.5 y 10 microgramos por metro cubico. El p-value del modelo es significativo (2.2e-16) por lo que se puede aceptar que el modelo no es por azar.

Ahora podemos usar el comando “anova” para comparar el modelo con todas las variables con el modelo con solo las variables significativas para entender si los resultados son estadísticamente diferentes.

Explorar los datos univariados y bivariados

summary(data)
##      city                tmpd             dptp             date           
##  Length:6940        Min.   :-16.00   Min.   :-25.62   Min.   :1987-01-01  
##  Class :character   1st Qu.: 35.00   1st Qu.: 27.00   1st Qu.:1991-10-01  
##  Mode  :character   Median : 51.00   Median : 39.88   Median :1996-07-01  
##                     Mean   : 50.31   Mean   : 40.34   Mean   :1996-07-01  
##                     3rd Qu.: 67.00   3rd Qu.: 55.75   3rd Qu.:2001-04-01  
##                     Max.   : 92.00   Max.   : 78.25   Max.   :2005-12-31  
##                     NA's   :1        NA's   :2                            
##    pm25tmean2      pm10tmean2        o3tmean2         no2tmean2     
##  Min.   : 1.70   Min.   :  2.00   Min.   : 0.1528   Min.   : 6.158  
##  1st Qu.: 9.70   1st Qu.: 21.50   1st Qu.:10.0729   1st Qu.:19.654  
##  Median :14.66   Median : 30.28   Median :18.5218   Median :24.556  
##  Mean   :16.23   Mean   : 33.90   Mean   :19.4355   Mean   :25.232  
##  3rd Qu.:20.60   3rd Qu.: 42.00   3rd Qu.:27.0010   3rd Qu.:30.139  
##  Max.   :61.50   Max.   :365.00   Max.   :66.5875   Max.   :62.480  
##  NA's   :4447    NA's   :242
pairs(data_numeric)

Creamos el vector de variables respuesta en función de las predictoras, y corremos el modelo de regresión multivariado.

mlm1 <- lm(cbind(tmpd, dptp) ~ pm25tmean2 + pm10tmean2 + o3tmean2 + no2tmean2, data = data)
summary(mlm1)
## Response tmpd :
## 
## Call:
## lm(formula = tmpd ~ pm25tmean2 + pm10tmean2 + o3tmean2 + no2tmean2, 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.677  -8.715   2.135  10.380  33.556 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.89284    1.23669  27.406   <2e-16 ***
## pm25tmean2  -0.05942    0.04363  -1.362    0.173    
## pm10tmean2   0.41926    0.02745  15.274   <2e-16 ***
## o3tmean2     0.88518    0.02843  31.139   <2e-16 ***
## no2tmean2   -0.50274    0.04981 -10.094   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.05 on 2483 degrees of freedom
##   (4452 observations deleted due to missingness)
## Multiple R-squared:  0.4608, Adjusted R-squared:  0.4599 
## F-statistic: 530.5 on 4 and 2483 DF,  p-value: < 2.2e-16
## 
## 
## Response dptp :
## 
## Call:
## lm(formula = dptp ~ pm25tmean2 + pm10tmean2 + o3tmean2 + no2tmean2, 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.663  -9.691   1.642  10.834  37.900 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.68183    1.29551  22.911  < 2e-16 ***
## pm25tmean2   0.30355    0.04570   6.642 3.79e-11 ***
## pm10tmean2   0.28936    0.02875  10.063  < 2e-16 ***
## o3tmean2     0.69273    0.02978  23.263  < 2e-16 ***
## no2tmean2   -0.67393    0.05218 -12.917  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.72 on 2483 degrees of freedom
##   (4452 observations deleted due to missingness)
## Multiple R-squared:  0.3558, Adjusted R-squared:  0.3547 
## F-statistic: 342.8 on 4 and 2483 DF,  p-value: < 2.2e-16

Buscamos entender de forma separada las variables

m1 <- lm(tmpd ~ dptp + pm25tmean2 + pm10tmean2 + o3tmean2, data = data)
summary(m1)
## 
## Call:
## lm(formula = tmpd ~ dptp + pm25tmean2 + pm10tmean2 + o3tmean2, 
##     data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.4361  -2.6675  -0.2679   2.4708  15.2113 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.978628   0.239491   37.49   <2e-16 ***
## dptp         0.908187   0.005082  178.72   <2e-16 ***
## pm25tmean2  -0.313487   0.011679  -26.84   <2e-16 ***
## pm10tmean2   0.181416   0.006881   26.36   <2e-16 ***
## o3tmean2     0.232547   0.008377   27.76   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.85 on 2483 degrees of freedom
##   (4452 observations deleted due to missingness)
## Multiple R-squared:  0.9595, Adjusted R-squared:  0.9594 
## F-statistic: 1.471e+04 on 4 and 2483 DF,  p-value: < 2.2e-16
m2 <- lm(pm25tmean2 ~ tmpd + dptp + pm10tmean2 + o3tmean2 + no2tmean2, data = data)
summary(m2)
## 
## Call:
## lm(formula = pm25tmean2 ~ tmpd + dptp + pm10tmean2 + o3tmean2 + 
##     no2tmean2, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.212  -3.295  -0.509   2.804  33.478 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.29307    0.56314   9.399  < 2e-16 ***
## tmpd        -0.72661    0.02554 -28.448  < 2e-16 ***
## dptp         0.71118    0.02418  29.415  < 2e-16 ***
## pm10tmean2   0.32182    0.01008  31.928  < 2e-16 ***
## o3tmean2     0.10114    0.01427   7.087 1.78e-12 ***
## no2tmean2    0.30490    0.01974  15.448  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.564 on 2482 degrees of freedom
##   (4452 observations deleted due to missingness)
## Multiple R-squared:  0.5895, Adjusted R-squared:  0.5887 
## F-statistic: 712.9 on 5 and 2482 DF,  p-value: < 2.2e-16
lr.model.prob <- predict(m1, type = "response")
lr.model.prob[1:30]
##     4023     4029     4035     4041     4047     4053     4059     4065 
## 52.17694 19.13518 23.55272 29.19331 36.25348 35.94015 43.07712 42.73895 
##     4071     4077     4083     4089     4095     4101     4107     4110 
## 41.90687 40.75831 36.07899 19.32836 46.27270 34.93974 69.82903 45.90900 
##     4111     4112     4113     4114     4115     4116     4117     4118 
## 45.26716 41.86561 36.36528 44.85610 48.64060 49.87723 42.78989 38.57987 
##     4119     4123     4124     4125     4126     4127 
## 46.20144 54.05372 47.74179 40.74245 44.87047 44.21020
# diagnostic plots
new_data <- data[1:100,]
fit1 <- lm(cbind(tmpd)  ~ pm10tmean2 +dptp + date +
               pm10tmean2 + pm10tmean2 + o3tmean2 + no2tmean2, data=new_data)
layout(matrix(c(1,2,3,4),2,2)) # optional 4 graphs/page
plot(fit1)